import re
from pathlib import Path

import requests
import scrapy

from ..items import ZhihuItem

class Zhihu1Spider(scrapy.Spider):
    """Scrape oalib.com search results for "high blood pressure" papers and
    download each result's PDF into a local directory.

    Pagination: after parsing a results page, requests the next page until
    ``pageNo`` exceeds 50.
    """
    name = 'zhihu1'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.oalib.com/search?type=0&oldType=0&kw=high+blood+pressure&searchField=All&__multiselect_searchField=&fromYear=&toYear=&pageNo=41']

    # Where downloaded PDFs are written; created on demand in parse().
    DOWNLOAD_DIR = Path('/Users/xiexiaohao/PycharmProjects/pythonProject/zhihu/zhihu/spiders/PDF1')

    # Next page number to request.
    # NOTE(review): starts at 7 while start_urls begins at pageNo=41, so the
    # crawl walks pages 41, then 8..50 — confirm the intended page range.
    num = 7

    def parse(self, response):
        """Download every PDF linked on this results page, then schedule the
        next results page.

        Yields:
            scrapy.Request: for the next search-results page, while num <= 50.
        """
        # One session reused for all PDFs on this page: connection pooling
        # instead of a fresh Session per file.
        session = requests.Session()

        rows = response.xpath('//*[@id="form1"]/div/center/table/tbody/tr[2]/td/table//tr/td[2]/div[3]/table//tr')
        for row in rows:
            pdf_links = row.xpath('./td/span[1]/a[2]/@href')
            if not pdf_links:
                # Row without a PDF link (header/spacer row) — skip it.
                continue

            title = ''.join(row.xpath('./td/span[1]/a[1]//text()').extract())
            # '/' is a path separator and cannot appear in a filename.
            filename = title.replace('/', '') + '.pdf'
            # urljoin handles relative hrefs; raw hrefs passed through unchanged.
            pdf_url = response.urljoin(pdf_links.extract_first())

            try:
                # Timeout so one dead server cannot hang the whole crawl;
                # raise_for_status so HTML error pages are not saved as .pdf.
                res = session.get(pdf_url, timeout=30)
                res.raise_for_status()
            except requests.RequestException as exc:
                self.logger.warning('failed to download %s: %s', pdf_url, exc)
                continue

            self.logger.info('saving %s', filename)
            self.DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
            # res.content is raw bytes — no text decoding needed for a PDF.
            (self.DOWNLOAD_DIR / filename).write_bytes(res.content)

            #item = ZhihuItem()
            #item['file_url'] = pdf_url
            #item['file_name'] = filename
            #yield item

        if self.num <= 50:
            self.num += 1
            new_url = ('https://www.oalib.com/search?type=0&oldType=0&kw=high+blood+pressure'
                       '&searchField=All&__multiselect_searchField=&fromYear=&toYear=&pageNo='
                       + str(self.num))
            yield scrapy.Request(url=new_url, callback=self.parse)


